In [3]:
%matplotlib inline
import h5py
import numpy as np
import matplotlib.pyplot as plt


# Read the complete image array and its label array from the source HDF5 file into memory.
ds = 'digits/data/dataset.h5'
with h5py.File(ds, 'r') as hf:
    # `[:]` copies each HDF5 dataset into a NumPy array so it outlives the closed file.
    dataset, labels = hf['dataset'][:], hf['labels'][:]

Zeroes

In [5]:
# Select every image, together with its label, whose label equals 0.
rows_0 = np.where(labels == 0)  # one lookup, shared by both selections
dataset_0, labels_0 = dataset[rows_0], labels[rows_0]
In [24]:
# Randomly sample up to 8000 zeros, keeping images and labels aligned.
r_index_0 = np.arange(len(labels_0))
np.random.shuffle(r_index_0)
keep_0 = r_index_0[:8000]  # trim the permutation first, then index both arrays with it
dataset_0, labels_0 = dataset_0[keep_0], labels_0[keep_0]
dataset_0.shape, labels_0.shape
Out[24]:
((8000, 784), (8000,))
In [25]:
# Show one page of the sampled zeros as an h-by-v grid of 28x28 thumbnails.
index = 0      # page number; page p covers images [p*h*v, (p+1)*h*v)
h, v = 22, 63  # grid rows, grid columns
fig = plt.figure(figsize=(28, 12))
fig.subplots_adjust(hspace=0.1, wspace=0.1)
# The page stride is h*v (the old h*v+1 skipped one image per page when index > 0).
first = h * v * index
# Slicing clips at the end of the array, so a short final page leaves cells empty
# instead of failing on reshape of an empty slice.
page = dataset_0[first:first + h * v]
for i, test_image in enumerate(page, start=1):
    ax = fig.add_subplot(h, v, i)
    ax.axis('off')
    ax.imshow(test_image.reshape(28, 28))

Ones

In [27]:
# Gather every sample labelled 1, then keep a random subset of up to 8000 of them.
rows_1 = np.where(labels == 1)  # one lookup, shared by both selections
dataset_1, labels_1 = dataset[rows_1], labels[rows_1]
r_index_1 = np.arange(len(labels_1))
np.random.shuffle(r_index_1)
keep_1 = r_index_1[:8000]  # trim the permutation first, then index both arrays with it
dataset_1, labels_1 = dataset_1[keep_1], labels_1[keep_1]
dataset_1.shape, labels_1.shape
Out[27]:
((8000, 784), (8000,))
In [28]:
# Show one page of the sampled ones as an h-by-v grid of 28x28 thumbnails.
index = 0      # page number; page p covers images [p*h*v, (p+1)*h*v)
h, v = 22, 63  # grid rows, grid columns
fig = plt.figure(figsize=(28, 12))
fig.subplots_adjust(hspace=0.1, wspace=0.1)
# The page stride is h*v (the old h*v+1 skipped one image per page when index > 0).
first = h * v * index
# Slicing clips at the end of the array, so a short final page leaves cells empty
# instead of failing on reshape of an empty slice.
page = dataset_1[first:first + h * v]
for i, test_image in enumerate(page, start=1):
    ax = fig.add_subplot(h, v, i)
    ax.axis('off')
    ax.imshow(test_image.reshape(28, 28))

Twos

In [29]:
# Gather every sample labelled 2, then keep a random subset of up to 8000 of them.
rows_2 = np.where(labels == 2)  # one lookup, shared by both selections
dataset_2, labels_2 = dataset[rows_2], labels[rows_2]
r_index_2 = np.arange(len(labels_2))
np.random.shuffle(r_index_2)
keep_2 = r_index_2[:8000]  # trim the permutation first, then index both arrays with it
dataset_2, labels_2 = dataset_2[keep_2], labels_2[keep_2]
dataset_2.shape, labels_2.shape
Out[29]:
((8000, 784), (8000,))
In [30]:
# Show one page of the sampled twos as an h-by-v grid of 28x28 thumbnails.
index = 0      # page number; page p covers images [p*h*v, (p+1)*h*v)
h, v = 22, 63  # grid rows, grid columns
fig = plt.figure(figsize=(28, 12))
fig.subplots_adjust(hspace=0.1, wspace=0.1)
# The page stride is h*v (the old h*v+1 skipped one image per page when index > 0).
first = h * v * index
# Slicing clips at the end of the array, so a short final page leaves cells empty
# instead of failing on reshape of an empty slice.
page = dataset_2[first:first + h * v]
for i, test_image in enumerate(page, start=1):
    ax = fig.add_subplot(h, v, i)
    ax.axis('off')
    ax.imshow(test_image.reshape(28, 28))

Threes

In [31]:
# Gather every sample labelled 3, then keep a random subset of up to 8000 of them.
rows_3 = np.where(labels == 3)  # one lookup, shared by both selections
dataset_3, labels_3 = dataset[rows_3], labels[rows_3]
r_index_3 = np.arange(len(labels_3))
np.random.shuffle(r_index_3)
keep_3 = r_index_3[:8000]  # trim the permutation first, then index both arrays with it
dataset_3, labels_3 = dataset_3[keep_3], labels_3[keep_3]
dataset_3.shape, labels_3.shape
Out[31]:
((8000, 784), (8000,))
In [32]:
# Show one page of the sampled threes as an h-by-v grid of 28x28 thumbnails.
index = 0      # page number; page p covers images [p*h*v, (p+1)*h*v)
h, v = 22, 63  # grid rows, grid columns
fig = plt.figure(figsize=(28, 12))
fig.subplots_adjust(hspace=0.1, wspace=0.1)
# The page stride is h*v (the old h*v+1 skipped one image per page when index > 0).
first = h * v * index
# Slicing clips at the end of the array, so a short final page leaves cells empty
# instead of failing on reshape of an empty slice.
page = dataset_3[first:first + h * v]
for i, test_image in enumerate(page, start=1):
    ax = fig.add_subplot(h, v, i)
    ax.axis('off')
    ax.imshow(test_image.reshape(28, 28))

Fours

In [34]:
# Gather every sample labelled 4, then keep a random subset of up to 8000 of them.
rows_4 = np.where(labels == 4)  # one lookup, shared by both selections
dataset_4, labels_4 = dataset[rows_4], labels[rows_4]
r_index_4 = np.arange(len(labels_4))
np.random.shuffle(r_index_4)
keep_4 = r_index_4[:8000]  # trim the permutation first, then index both arrays with it
dataset_4, labels_4 = dataset_4[keep_4], labels_4[keep_4]
dataset_4.shape, labels_4.shape
Out[34]:
((8000, 784), (8000,))
In [35]:
# Show one page of the sampled fours as an h-by-v grid of 28x28 thumbnails.
index = 0      # page number; page p covers images [p*h*v, (p+1)*h*v)
h, v = 22, 63  # grid rows, grid columns
fig = plt.figure(figsize=(28, 12))
fig.subplots_adjust(hspace=0.1, wspace=0.1)
# The page stride is h*v (the old h*v+1 skipped one image per page when index > 0).
first = h * v * index
# Slicing clips at the end of the array, so a short final page leaves cells empty
# instead of failing on reshape of an empty slice.
page = dataset_4[first:first + h * v]
for i, test_image in enumerate(page, start=1):
    ax = fig.add_subplot(h, v, i)
    ax.axis('off')
    ax.imshow(test_image.reshape(28, 28))

Fives

In [37]:
# Gather every sample labelled 5, then keep a random subset of up to 8000 of them.
rows_5 = np.where(labels == 5)  # one lookup, shared by both selections
dataset_5, labels_5 = dataset[rows_5], labels[rows_5]
r_index_5 = np.arange(len(labels_5))
np.random.shuffle(r_index_5)
keep_5 = r_index_5[:8000]  # trim the permutation first, then index both arrays with it
dataset_5, labels_5 = dataset_5[keep_5], labels_5[keep_5]
dataset_5.shape, labels_5.shape
Out[37]:
((8000, 784), (8000,))
In [38]:
# Show one page of the sampled fives as an h-by-v grid of 28x28 thumbnails.
index = 0      # page number; page p covers images [p*h*v, (p+1)*h*v)
h, v = 22, 63  # grid rows, grid columns
fig = plt.figure(figsize=(28, 12))
fig.subplots_adjust(hspace=0.1, wspace=0.1)
# The page stride is h*v (the old h*v+1 skipped one image per page when index > 0).
first = h * v * index
# Slicing clips at the end of the array, so a short final page leaves cells empty
# instead of failing on reshape of an empty slice.
page = dataset_5[first:first + h * v]
for i, test_image in enumerate(page, start=1):
    ax = fig.add_subplot(h, v, i)
    ax.axis('off')
    ax.imshow(test_image.reshape(28, 28))

Sixes

In [42]:
# Gather every sample labelled 6, then keep a random subset of up to 8000 of them.
rows_6 = np.where(labels == 6)  # one lookup, shared by both selections
dataset_6, labels_6 = dataset[rows_6], labels[rows_6]
r_index_6 = np.arange(len(labels_6))
np.random.shuffle(r_index_6)
keep_6 = r_index_6[:8000]  # trim the permutation first, then index both arrays with it
dataset_6, labels_6 = dataset_6[keep_6], labels_6[keep_6]
dataset_6.shape, labels_6.shape
Out[42]:
((8000, 784), (8000,))
In [43]:
# Show one page of the sampled sixes as an h-by-v grid of 28x28 thumbnails.
index = 0      # page number; page p covers images [p*h*v, (p+1)*h*v)
h, v = 22, 63  # grid rows, grid columns
fig = plt.figure(figsize=(28, 12))
fig.subplots_adjust(hspace=0.1, wspace=0.1)
# The page stride is h*v (the old h*v+1 skipped one image per page when index > 0).
first = h * v * index
# Slicing clips at the end of the array, so a short final page leaves cells empty
# instead of failing on reshape of an empty slice.
page = dataset_6[first:first + h * v]
for i, test_image in enumerate(page, start=1):
    ax = fig.add_subplot(h, v, i)
    ax.axis('off')
    ax.imshow(test_image.reshape(28, 28))

Sevens

In [44]:
# Gather every sample labelled 7, then keep a random subset of up to 8000 of them.
rows_7 = np.where(labels == 7)  # one lookup, shared by both selections
dataset_7, labels_7 = dataset[rows_7], labels[rows_7]
r_index_7 = np.arange(len(labels_7))
np.random.shuffle(r_index_7)
keep_7 = r_index_7[:8000]  # trim the permutation first, then index both arrays with it
dataset_7, labels_7 = dataset_7[keep_7], labels_7[keep_7]
dataset_7.shape, labels_7.shape
Out[44]:
((8000, 784), (8000,))
In [45]:
# Show one page of the sampled sevens as an h-by-v grid of 28x28 thumbnails.
index = 0      # page number; page p covers images [p*h*v, (p+1)*h*v)
h, v = 22, 63  # grid rows, grid columns
fig = plt.figure(figsize=(28, 12))
fig.subplots_adjust(hspace=0.1, wspace=0.1)
# The page stride is h*v (the old h*v+1 skipped one image per page when index > 0).
first = h * v * index
# Slicing clips at the end of the array, so a short final page leaves cells empty
# instead of failing on reshape of an empty slice.
page = dataset_7[first:first + h * v]
for i, test_image in enumerate(page, start=1):
    ax = fig.add_subplot(h, v, i)
    ax.axis('off')
    ax.imshow(test_image.reshape(28, 28))

Eights

In [46]:
# Gather every sample labelled 8, then keep a random subset of up to 8000 of them.
rows_8 = np.where(labels == 8)  # one lookup, shared by both selections
dataset_8, labels_8 = dataset[rows_8], labels[rows_8]
r_index_8 = np.arange(len(labels_8))
np.random.shuffle(r_index_8)
keep_8 = r_index_8[:8000]  # trim the permutation first, then index both arrays with it
dataset_8, labels_8 = dataset_8[keep_8], labels_8[keep_8]
dataset_8.shape, labels_8.shape
Out[46]:
((8000, 784), (8000,))
In [47]:
# Show one page of the sampled eights as an h-by-v grid of 28x28 thumbnails.
index = 0      # page number; page p covers images [p*h*v, (p+1)*h*v)
h, v = 22, 63  # grid rows, grid columns
fig = plt.figure(figsize=(28, 12))
fig.subplots_adjust(hspace=0.1, wspace=0.1)
# The page stride is h*v (the old h*v+1 skipped one image per page when index > 0).
first = h * v * index
# Slicing clips at the end of the array, so a short final page leaves cells empty
# instead of failing on reshape of an empty slice.
page = dataset_8[first:first + h * v]
for i, test_image in enumerate(page, start=1):
    ax = fig.add_subplot(h, v, i)
    ax.axis('off')
    ax.imshow(test_image.reshape(28, 28))

Nines

In [48]:
# Gather every sample labelled 9, then keep a random subset of up to 8000 of them.
rows_9 = np.where(labels == 9)  # one lookup, shared by both selections
dataset_9, labels_9 = dataset[rows_9], labels[rows_9]
r_index_9 = np.arange(len(labels_9))
np.random.shuffle(r_index_9)
keep_9 = r_index_9[:8000]  # trim the permutation first, then index both arrays with it
dataset_9, labels_9 = dataset_9[keep_9], labels_9[keep_9]
dataset_9.shape, labels_9.shape
Out[48]:
((8000, 784), (8000,))
In [50]:
# Show one page of the sampled nines as an h-by-v grid of 28x28 thumbnails.
index = 0      # page number; page p covers images [p*h*v, (p+1)*h*v)
h, v = 22, 63  # grid rows, grid columns
fig = plt.figure(figsize=(28, 12))
fig.subplots_adjust(hspace=0.1, wspace=0.1)
# The page stride is h*v (the old h*v+1 skipped one image per page when index > 0).
first = h * v * index
# Slicing clips at the end of the array, so a short final page leaves cells empty
# instead of failing on reshape of an empty slice.
page = dataset_9[first:first + h * v]
for i, test_image in enumerate(page, start=1):
    ax = fig.add_subplot(h, v, i)
    ax.axis('off')
    ax.imshow(test_image.reshape(28, 28))

Combine the per-digit samples

In [51]:
# Stack the ten per-digit samples row-wise, with the labels joined in the same digit order.
per_digit_data = [dataset_0, dataset_1, dataset_2, dataset_3, dataset_4,
                  dataset_5, dataset_6, dataset_7, dataset_8, dataset_9]
per_digit_labels = [labels_0, labels_1, labels_2, labels_3, labels_4,
                    labels_5, labels_6, labels_7, labels_8, labels_9]
balanced_dataset = np.concatenate(per_digit_data, axis=0)
balanced_labels = np.concatenate(per_digit_labels)
In [58]:
# Shuffle images and labels with one shared permutation so each pair stays aligned.
rand_index = np.arange(len(balanced_labels))
np.random.shuffle(rand_index)
balanced_dataset, balanced_labels = balanced_dataset[rand_index], balanced_labels[rand_index]
balanced_dataset.shape, balanced_labels.shape
Out[58]:
((80000, 784), (80000,))
In [59]:
# Spot-check the shuffled data: print each label and draw its image in a single row.
index = 0     # page number; page p covers images [p*h*v, (p+1)*h*v)
h, v = 1, 63  # grid rows, grid columns
fig = plt.figure(figsize=(28, 1))
fig.subplots_adjust(hspace=0.1, wspace=0.1)
# The page stride is h*v (the old h*v+1 skipped one image per page when index > 0).
first = h * v * index
# Slicing clips at the end of the arrays, so a short final page leaves cells empty
# instead of failing on reshape of an empty slice.
page_images = balanced_dataset[first:first + h * v]
page_labels = balanced_labels[first:first + h * v]
for i, test_image in enumerate(page_images, start=1):
    print(page_labels[i - 1:i])  # one-element slice, so each label prints as e.g. [1]
    ax = fig.add_subplot(h, v, i)
    ax.axis('off')
    ax.imshow(test_image.reshape(28, 28))
[1]
[6]
[6]
[4]
[3]
[7]
[3]
[8]
[7]
[7]
[9]
[3]
[8]
[1]
[9]
[2]
[1]
[4]
[8]
[6]
[3]
[1]
[0]
[1]
[1]
[0]
[0]
[7]
[4]
[0]
[5]
[7]
[2]
[2]
[2]
[8]
[7]
[7]
[5]
[8]
[1]
[1]
[5]
[4]
[1]
[2]
[0]
[4]
[8]
[6]
[4]
[1]
[3]
[3]
[2]
[6]
[6]
[9]
[3]
[8]
[9]
[5]
[3]
In [60]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; `train_test_split`
# now lives in `sklearn.model_selection`. Fall back to the old module only on
# installations that predate `model_selection`.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
In [66]:
# 60/20/20 train/test/validation split of the balanced data.
# Stratifying on the labels keeps the digit proportions equal in every split; a plain
# random split only preserves them on average. The fixed seed makes the partition
# repeatable for a given `balanced_dataset` / `balanced_labels` pair.
split_seed = 0
# First cut: 60% train, 40% hold-out (kept under its own name rather than reusing
# the `dataset_test` / `labels_test` names for an intermediate result).
dataset_train, dataset_holdout, labels_train, labels_holdout = train_test_split(
    balanced_dataset, balanced_labels,
    test_size=0.4, stratify=balanced_labels, random_state=split_seed)
# Second cut: halve the hold-out into the test and validation sets.
dataset_test, dataset_val, labels_test, labels_val = train_test_split(
    dataset_holdout, labels_holdout,
    test_size=0.5, stratify=labels_holdout, random_state=split_seed)
In [68]:
# Spot-check the training split: print each label and draw its image in a single row.
index = 0     # page number; page p covers images [p*h*v, (p+1)*h*v)
h, v = 1, 63  # grid rows, grid columns
fig = plt.figure(figsize=(28, 1))
fig.subplots_adjust(hspace=0.1, wspace=0.1)
# The page stride is h*v (the old h*v+1 skipped one image per page when index > 0).
first = h * v * index
# Slicing clips at the end of the arrays, so a short final page leaves cells empty
# instead of failing on reshape of an empty slice.
page_images = dataset_train[first:first + h * v]
page_labels = labels_train[first:first + h * v]
for i, test_image in enumerate(page_images, start=1):
    print(page_labels[i - 1:i])  # one-element slice, so each label prints as e.g. [7]
    ax = fig.add_subplot(h, v, i)
    ax.axis('off')
    ax.imshow(test_image.reshape(28, 28))
[7]
[1]
[2]
[1]
[3]
[7]
[6]
[7]
[5]
[7]
[8]
[4]
[0]
[5]
[6]
[4]
[0]
[0]
[9]
[6]
[6]
[8]
[0]
[9]
[9]
[6]
[8]
[8]
[5]
[3]
[1]
[9]
[4]
[4]
[5]
[4]
[2]
[5]
[8]
[9]
[6]
[8]
[0]
[8]
[3]
[0]
[3]
[0]
[6]
[9]
[0]
[0]
[7]
[3]
[2]
[8]
[3]
[1]
[9]
[5]
[3]
[7]
[6]

Save the train/test/validation splits

In [71]:
# Write the three splits to a single HDF5 file, one `<split>_data` / `<split>_labels`
# pair per split. The arrays are the ones produced by the train_test_split cell.
dataset_path = 'digits/data/balanced_dataset.h5'
splits = (
    ('train_data', dataset_train),
    ('train_labels', labels_train),
    ('test_data', dataset_test),
    ('test_labels', labels_test),
    ('val_data', dataset_val),
    ('val_labels', labels_val),
)
# Mode 'w' creates the file, replacing any existing one at this path.
with h5py.File(dataset_path, 'w') as hf:
    for name, array in splits:
        hf.create_dataset(name, data=array)

Test: reopen the saved file

In [72]:
# Reopen the saved file read-only and load all six arrays back into memory.
ds = 'digits/data/balanced_dataset.h5'
with h5py.File(ds, 'r') as hf:
    # `[:]` copies each HDF5 dataset into a NumPy array so it outlives the closed file.
    train_data, train_labels = hf['train_data'][:], hf['train_labels'][:]
    test_data, test_labels = hf['test_data'][:], hf['test_labels'][:]
    val_data, val_labels = hf['val_data'][:], hf['val_labels'][:]
In [ ]:
 
In [ ]: